library(tidyverse)
library(readxl)
library(lme4)
library(lmerTest)
raw_eab_data <- read_excel("emerald-ash-borer-surveillance-data-2002-to-2020-en.xlsx")
# change column names from all uppercase to all lowercase to make it easier to work with
colnames(raw_eab_data) <- c("latitude", "longitude", "survey", "year",
"province", "community", "result")
# separate dataset for Ontario only and keep only necessary columns
ont_eab_data <- raw_eab_data %>% subset(province == "ONTARIO")
ont_eab_data <- ont_eab_data[, c(1, 2, 4, 6, 7)]
# remove NAs from community for future matching
ont_eab_data <- ont_eab_data %>% subset(!is.na(community))
Some of community names in our EAB data include designations that are common to more than one community (ex. County, District, etc) and although some of the census data names also have this feature, we can’t guess whether the corresponding communities will be formatted the same way (ex. will “Bruce County” in our EAB data be written as “Bruce” or “Bruce County” in the census?). However, the designation may help us in our report later on when we have to talk about the different communities. That’s why instead of changing our existing community names, I created a new column of the names with the designations removed. We will also do the same thing to the census data after downloading it.
To determine whether the communities in our EAB data are urban or rural, we decided to use the Canadian 2021 census from Statistics Canada. We found a dataset containing only census information for population centres in Canada. According to StatCan, population centres are communities that have a population of at least 1000 people and a population density of at least 400 people per square kilometre. Locations that don’t meet these requirements are considered rural. If the communities from our EAB data are found in the population centres data, we can identify them as urban, and if they are missing, they will be rural. Since the dataset is for all of Canada, we will also subset to only population centres in Ontario.
# download census 2021 data for population centres and unzip file
temp <- tempfile()
download.file("https://www150.statcan.gc.ca/n1/tbl/csv/98100011-eng.zip", temp)
trying URL 'https://www150.statcan.gc.ca/n1/tbl/csv/98100011-eng.zip'
Content type 'application/zip' length 94350 bytes (92 KB)
==================================================
downloaded 92 KB
census_2021_pop_centres <- read.csv(unz(temp, "98100011.csv"))
unlink(temp)
# subset dataframe to Ontario only and
# keep name, population, and population density columns
## to exclude the rows for the provinces themselves, the chosen rows are
## 1 after the Ontario row and 1 before the Manitoba row
census_2021_pop_centres <- census_2021_pop_centres[380:681, c(2, 3, 5, 17)]
# simplify column names
colnames(census_2021_pop_centres) <- c("community_name", "DGUID",
"population_2021", "pop_density_2021")
# new column with designations removed
census_2021_pop_centres <- census_2021_pop_centres %>%
filter(!is.na(community_name)) %>%
mutate(community_name_2 = str_replace_all(census_2021_pop_centres$community_name,
c(" County" = "", " District" = "", " Division" = "",
" Regional Municipality" = "", " Municipality" = "")))
Now that we have data for the urban communities, the next step is to match the community names from our EAB data to the census data and determine the community type. We made an empty column ‘community_type’ in our EAB data so that we can use a for loop to populate it.
ont_eab_data$community_type <- NA # new column for community type
for (i in 1:nrow(ont_eab_data)) { # look through every row in our EAB data
j <- which(tolower(ont_eab_data$community_2[i]) ==
tolower(census_2021_pop_centres$community_name_2))
# find this row's community in column of census data community names
# searching for identical names -> NO partial matches
# returns row number of matching name or integer (0) if no matching name found
if (!identical(j, integer (0))) { # if row number returned
ont_eab_data$community_type[i] <- "urban"
} else {
ont_eab_data$community_type[i] <- "rural"
}
}
ont_eab_data$avg_temp <- NA # new column for temperature
for (i in 1:nrow(ont_eab_data)) { # look through every row in our EAB data
k <- which(tolower(ont_eab_data$community[i]) ==
tolower(temperature_data$Community))
# return row number of matching community name
ont_eab_data$avg_temp[i] <- temperature_data$Avg_Temp[k]
}
Has EAB abundance changed over the years in different Ontario communities?
anova(time_community_model, test="Chi")
Analysis of Deviance Table
Model: binomial, link: logit
Response: as.factor(result)
Terms added sequentially (first to last)
Df Deviance Resid. Df Resid. Dev Pr(>Chi)
NULL 38000 8649.4
year 1 0.40 37999 8649.0 0.5271
community 153 2255.84 37846 6393.1 <2e-16 ***
year:community 106 366.14 37740 6027.0 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
anova(time_glm_model, test = "Chi")
Analysis of Deviance Table
Model: binomial, link: logit
Response: as.factor(result)
Terms added sequentially (first to last)
Df Deviance Resid. Df Resid. Dev Pr(>Chi)
NULL 38000 8649.4
year 1 0.40001 37999 8649.0 0.5271
# effect of latitude and longitude and their interaction on EAB detection
coordinates_glm_model <- glm(as.factor(result)~year * latitude * longitude,
family = binomial, data = ont_eab_data)
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(coordinates_glm_model)
Call:
glm(formula = as.factor(result) ~ year * latitude * longitude,
family = binomial, data = ont_eab_data)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 4.469e+05 2.656e+04 16.82 <2e-16 ***
year -2.221e+02 1.319e+01 -16.84 <2e-16 ***
latitude -9.534e+03 5.867e+02 -16.25 <2e-16 ***
longitude 5.387e+03 3.146e+02 17.12 <2e-16 ***
year:latitude 4.738e+00 2.911e-01 16.27 <2e-16 ***
year:longitude -2.677e+00 1.562e-01 -17.14 <2e-16 ***
latitude:longitude -1.150e+02 6.954e+00 -16.54 <2e-16 ***
year:latitude:longitude 5.716e-02 3.450e-03 16.57 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 8649.4 on 38000 degrees of freedom
Residual deviance: 8014.2 on 37993 degrees of freedom
AIC: 8030.2
Number of Fisher Scoring iterations: 9
# latitude and longitude and their interaction on community type
coord_type_model <- glm(as.factor(community_type)~latitude * longitude,
family = binomial, data = ont_eab_data)
summary(coord_type_model)
Call:
glm(formula = as.factor(community_type) ~ latitude * longitude,
family = binomial, data = ont_eab_data)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.366e+02 1.453e+01 -23.16 <2e-16 ***
latitude 7.006e+00 3.055e-01 22.93 <2e-16 ***
longitude -3.828e+00 1.747e-01 -21.91 <2e-16 ***
latitude:longitude 7.952e-02 3.665e-03 21.70 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 33928 on 38000 degrees of freedom
Residual deviance: 30269 on 37997 degrees of freedom
AIC: 30277
Number of Fisher Scoring iterations: 4
# latitude and longitude and their interaction on mean temperature
coord_temp_model <- lm(avg_temp ~ latitude * longitude, data = ont_eab_data)
summary(coord_temp_model)
Call:
lm(formula = avg_temp ~ latitude * longitude, data = ont_eab_data)
Residuals:
Min 1Q Median 3Q Max
-4.2880 -0.3075 0.2186 0.4849 6.3558
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.144e+02 4.678e+00 -45.83 <2e-16 ***
latitude 4.401e+00 9.987e-02 44.07 <2e-16 ***
longitude -3.344e+00 5.596e-02 -59.76 <2e-16 ***
latitude:longitude 6.804e-02 1.192e-03 57.08 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.8742 on 37997 degrees of freedom
Multiple R-squared: 0.8355, Adjusted R-squared: 0.8355
F-statistic: 6.435e+04 on 3 and 37997 DF, p-value: < 2.2e-16
# effect of community type on EAB abundance
types_result <- table(ont_eab_data$community_type, ont_eab_data$result)
types_result
DETECTED NOT DETECTED
rural 458 31307
urban 460 5776
chisq.test(ont_eab_data$community_type, ont_eab_data$result, correct = F)
Pearson's Chi-squared test
data: ont_eab_data$community_type and ont_eab_data$result
X-squared = 778.8, df = 1, p-value < 2.2e-16
# effect of community type on EAB detection over the years
type_model <- glm(as.factor(result)~year * community_type,
family = binomial, data = ont_eab_data)
summary(type_model)
Call:
glm(formula = as.factor(result) ~ year * community_type, family = binomial,
data = ont_eab_data)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 240.25054 26.60605 9.030 <2e-16 ***
year -0.11766 0.01326 -8.873 <2e-16 ***
community_typeurban -444.33419 35.84048 -12.398 <2e-16 ***
year:community_typeurban 0.22066 0.01787 12.349 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 8649.4 on 38000 degrees of freedom
Residual deviance: 7920.9 on 37997 degrees of freedom
AIC: 7928.9
Number of Fisher Scoring iterations: 7
temp_model <- glm(as.factor(result)~avg_temp,
family = binomial, data = ont_eab_data)
summary(temp_model)
Call:
glm(formula = as.factor(result) ~ avg_temp, family = binomial,
data = ont_eab_data)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 5.55926 0.21630 25.702 <2e-16 ***
avg_temp -0.20102 0.02241 -8.969 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 8649.4 on 38000 degrees of freedom
Residual deviance: 8545.3 on 37999 degrees of freedom
AIC: 8549.3
Number of Fisher Scoring iterations: 7
summary(temp_type_model)
Call:
glm(formula = as.factor(result) ~ avg_temp * community_type,
family = binomial, data = ont_eab_data)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 4.47027 0.27186 16.443 < 2e-16 ***
avg_temp -0.02670 0.02901 -0.921 0.35728
community_typeurban 1.21434 0.38997 3.114 0.00185 **
avg_temp:community_typeurban -0.32738 0.04038 -8.108 5.15e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 8649.4 on 38000 degrees of freedom
Residual deviance: 7786.2 on 37997 degrees of freedom
AIC: 7794.2
Number of Fisher Scoring iterations: 7
result_colour <- c("#FF0000", "#3333CC")
ggplot(ont_eab_data, aes(x = year, fill = result)) +
geom_histogram(stat = "count") +
scale_x_continuous(breaks = seq(2002, 2020, by = 2)) +
theme_bw() +
scale_fill_manual(values = result_colour) +
labs(title = "EAB Detection Throughout Time", x = "Year", y = "Observation Count", fill = "Detection Result")
Warning: Ignoring unknown parameters: `binwidth`, `bins`, and `pad`
As you can see, the observations are skewed through the years. The greatest number of observations happened between the years 2003 and 2010, which corresponds with some of the first recorded incidences of the EAB in North America. The most-important takeaway from this graph is the frequency of observations within this dataset changed significantly more over time than the abundance of EABs did. Furthermore, this change in observation frequency over the years makes it harder to compare and contrast detection rate through the years.
ggplot(ont_eab_data, aes(x = longitude, y = latitude, color = result)) +
geom_point(size = 0.8, alpha = 0.6) +
theme_bw() +
scale_color_manual(values = result_colour) +
labs(title = "EAB Detection in Ontario by Longitude & Latitude", x = "Longitude", y = "Latitude", color = "Detection Result")
EAB detection is concentrated in one general area!
ggplot(ont_eab_data, aes(x = longitude, y = latitude, color = community_type)) +
geom_point(size = 0.8, alpha = 0.6) +
theme_bw() +
scale_color_manual(values = colour) +
labs(title = "Community Type in Ontario by Longitude & Latitude", x = "Longitude", y = "Latitude", color = "Community Type")
There IS a correlation between coordinates and community type
ggplot(ont_eab_data, aes(x = longitude, y = latitude, color = avg_temp)) +
geom_point(size = 1, alpha = 0.3) +
theme_bw() +
labs(title = "Annual Average Temperature in Ontario by Longitude & Latitude", x = "Longitude", y = "Latitude", color = "Average Annual Temperature (˚C)")
ont_eab_data %>%
ggplot(aes(x = year, fill = community_type)) +
geom_histogram(binwidth = 1.2) +
facet_wrap(~result, scales = "free_y") +
scale_x_continuous(breaks = seq(2002, 2020, by = 2)) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
scale_fill_manual(values = colour) +
labs(title = "EAB Detection by Year and Community Type", x = "Year", y = "Number of Observations",
fill = "Community Type")
Note the differences in y-axis scales between “Detected” and “Not Detected”, as well as the significantly higher number of observations in the earlier half of the dataset.
ggplot(ont_eab_data, aes(x = result, y = year, fill = result)) +
geom_boxplot() +
geom_point(size = 0.6, alpha = 0.4) + facet_wrap(~community_type) +
theme_bw() +
scale_fill_manual(values = result_colour) +
labs(title = "EAB Detection by Year and Community Type", x = "", y = "Year", fill = "Detection Result")
ggplot(ont_eab_data, aes(x = result, y = year, fill = result)) +
geom_violin() +
facet_wrap(~community_type) +
theme_bw() +
scale_fill_manual(values = result_colour) +
labs(title = "EAB Detection by Year and Community Type", x = "", y = "Year", fill = "Detection Result")
ggplot(ont_eab_data, aes(x = community_type, y = year, fill = community_type)) +
geom_violin() +
facet_wrap(~result) +
theme_bw() +
scale_fill_manual(values = colour) +
labs(title = "EAB Detection by Year and Community Type", x = "", y = "Year", fill = "Community Type")
ggplot(ont_eab_data, aes(x = year, y = community_type, fill = community_type)) +
geom_violin() +
facet_wrap(~result) +
theme_bw() +
scale_x_continuous(breaks = seq(2002, 2020, by = 2)) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
scale_fill_manual(values = colour) +
labs(title = "EAB Detection by Year and Community Type", x = "Year", y = "", fill = "Community Type")
community_count <- ont_eab_data %>%
count(community, wt = NULL, sort = TRUE)
total_dataframe <- cbind(temperature_data, community_count)
most_observations <- total_dataframe %>%
filter(n >= 200)
ggplot(most_observations, aes(x = community, y = n)) +
geom_point() +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
labs(x = "Community", y = "Observations per Community", title = "Number of Observations from Each Community (n ≥ 200)")
Chatham-Kent Division (rural)
ont_eab_data %>%
filter(community == "CHATHAM-KENT DIVISION") %>%
ggplot(aes(x = year, fill = result)) +
geom_histogram(stat = "count") +
theme_bw() +
scale_fill_manual(values = result_colour) +
labs(title = "EAB Detection by Year in the Chatham-Kent Division", x = "Year", y = "Number of Observations", fill = "Detection Result")
Warning: Ignoring unknown parameters: `binwidth`, `bins`, and `pad`
Our dataset is dominated by a couple of communities especially during the early years of data collection.
ggplot(ont_eab_data, aes(x = year, y = avg_temp, color = result)) +
geom_point() +
geom_smooth(method = "glm", family = "binomial", se = FALSE) +
scale_color_manual(values = result_colour) +
theme_bw() +
labs(title = "EAB Detection Through Time as a Function of Temperature", x = "Year", y = "Average Temperature (˚C)", color = "Detection Result")
Warning: Ignoring unknown parameters: `family`
ggplot(ont_eab_data, aes(x = year, y = avg_temp, color = result)) +
geom_point() +
facet_wrap(~community_type) +
theme_bw() +
scale_x_continuous(breaks = seq(2002, 2020, by = 2)) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
scale_color_manual(values = result_colour) +
labs(title = "EAB Detection by Year, Temperature and Community Type", x = "Year", y = "Average Annual Temperature (˚C)", color = "Detection Result")
colour <- c(rural = "forestgreen", urban = "darkgrey")
ggplot(ont_eab_data, aes(x = year, y = avg_temp, color = community_type)) +
geom_point() +
facet_wrap(~result) +
scale_color_manual(values = colour) +
theme_bw() +
geom_smooth(method = "lm", se = FALSE) +
scale_x_continuous(breaks = seq(2002, 2020, by = 2)) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
labs(title = "EAB Detection by Year, Temperature and Community Type", x = "Year", y = "Average Annual Temperature (˚C)", color = "Community Type")
kruskal.test(result ~ avg_temp*community,
data = ont_eab_data)
Error in kruskal.test.formula(result ~ avg_temp * community, data = ont_eab_data) :
'formula' should be of the form response ~ group
temp_model <- glm(as.factor(result)~ avg_temp + (1|community), family = "binomial", data = ont_eab_data)
Error in 1 | community :
operations are possible only for numeric, logical or complex types